import json
import re
from pathlib import Path

# Root directory of the sample data; each board's thread files are looked up
# in a sub-directory of this path named after the board.
SAMPLES_PATH = Path("samples")
# Names of the boards to report on, processed in this order.
boards = ["pol", "soy", "r9k"]

# A word is a run of ASCII letters, optionally followed by a single
# apostrophe-joined run of letters (e.g. "don't").
word_pat = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")


def tokens(text: str) -> list[str]:
    """Return every word found in *text*, lower-cased, in order of appearance."""
    # The pattern's only group is non-capturing, so findall() yields the
    # whole match for each word.
    found = word_pat.findall(text)
    return [word.lower() for word in found]


def posts_in_thread(obj):
    """Locate the list of posts inside a decoded thread object.

    * A list is returned unchanged.
    * For a dict, the value under ``"posts"`` is returned when it is a list;
      otherwise the first value (in the dict's iteration order) that is a
      list is returned.
    * Anything else, or a dict containing no list value, yields ``[]``.
    """
    if isinstance(obj, list):
        return obj
    if not isinstance(obj, dict):
        return []

    explicit = obj.get("posts")
    if isinstance(explicit, list):
        return explicit

    # No usable "posts" entry: fall back to the first list-valued entry.
    return next(
        (value for value in obj.values() if isinstance(value, list)),
        [],
    )


def stats(posts):
    """Summarise word usage over an iterable of posts.

    A post is counted only when it is a dict whose ``"text"`` value is a
    non-empty string that produces at least one token; every other entry
    (including non-dict entries) is skipped.

    Returns a 3-tuple ``(avg_words, conj_ratio, counted)``:

    * ``avg_words`` -- total tokens divided by the number of counted posts,
      or ``0.0`` when no post was counted.
    * ``conj_ratio`` -- share of all tokens equal to ``"and"`` or ``"or"``,
      or ``0.0`` when there are no tokens.
    * ``counted`` -- number of posts that contributed tokens.
    """
    conjunctions = frozenset({"and", "or"})
    n = w_total = conj_total = 0

    for p in posts:
        # Entries come from arbitrary JSON lists, so they need not be dicts;
        # skip them instead of failing on `.get`.
        if not isinstance(p, dict):
            continue

        t = p.get("text", "")
        if not isinstance(t, str) or not t:
            continue

        ws = tokens(t)
        if not ws:
            continue

        n += 1
        w_total += len(ws)
        conj_total += sum(1 for w in ws if w in conjunctions)

    avg_words = w_total / n if n else 0.0
    conj_ratio = conj_total / w_total if w_total else 0.0
    return (avg_words, conj_ratio, n)


# For each board: load every thread file, split its posts into OP / non-OP
# buckets, and print word statistics for all posts and for each bucket.
for name in boards:
    # Buckets are rebuilt per board so figures never carry over.
    all_posts = []
    op_posts = []
    non_op_posts = []

    for fp in (SAMPLES_PATH / name).glob("*.json"):
        with fp.open("r", encoding="utf-8") as f:
            thread = json.load(f)

        for p in posts_in_thread(thread):
            # posts_in_thread may hand back any JSON list, so entries are not
            # guaranteed to be dicts; skip the ones that cannot be inspected.
            if not isinstance(p, dict):
                continue

            all_posts.append(p)

            # A post whose "post" value equals its "thread" value is treated
            # as the OP.
            # NOTE(review): a post missing both keys compares None == None
            # and therefore also lands in the OP bucket -- confirm intended.
            if p.get("post") == p.get("thread"):
                op_posts.append(p)
            else:
                non_op_posts.append(p)

    # Compute all figures before printing anything for this board.
    avg_all, ratio_all, n_all = stats(all_posts)
    avg_op, ratio_op, n_op = stats(op_posts)
    avg_nop, ratio_nop, n_nop = stats(non_op_posts)

    print(name)
    sections = (
        ("All posts", n_all, avg_all, ratio_all),
        ("OPs only", n_op, avg_op, ratio_op),
        ("Non-OPs only", n_nop, avg_nop, ratio_nop),
    )
    for heading, count, avg_words, conj_ratio in sections:
        print(f"  {heading}")
        print(f"    Number of posts           : {count}")
        print(f"    Average word count        : {avg_words:.2f}")
        print(f"    Conjunctive ratio (and/or): {conj_ratio:.6f}")


